﻿#include "pch.h"
#include "CodeEncoding.h"

// Guesses the encoding of a BOM-less byte string.
//
// Returns:
//   Encoding::ASCII     - no byte has its high bit set (this includes the empty string)
//   Encoding::UTF8      - at least one high byte and IsUtf8() accepts the string
//   Encoding::Ansi      - not UTF-8, but IsGBK() accepts the string
//   Encoding::Undefined - none of the above
//
// UTF-8 is tested before GBK on purpose: the GBK byte ranges are so wide that
// most UTF-8 text whose high bytes happen to pair up also passes the GBK test,
// whereas genuine GBK text very rarely forms valid UTF-8 sequences.
Encoding CodeEncoding::Check(const std::string& str)
{
	// Decide "pure ASCII" here by looking for any byte with the high bit set.
	bool hasHighByte = false;
	for (const char ch : str)
	{
		if ((static_cast<unsigned char>(ch) & 0x80) != 0)
		{
			hasHighByte = true;
			break;
		}
	}

	if (!hasHighByte)
	{
		return Encoding::ASCII;
	}

	if (IsUtf8(str))
	{
		return Encoding::UTF8;
	}

	if (IsGBK(str, nullptr))
	{
		return Encoding::Ansi;
	}

	return Encoding::Undefined;
}


// Identifies a Unicode byte-order mark at the start of a buffer.
//
// bom  - pointer to the first bytes of the data; may be null (treated as "no BOM")
// size - number of readable bytes at `bom`
//
// Returns the Encoding value matching the BOM found, or Encoding::Undefined
// when the buffer is null, too short, or does not start with a known BOM.
//
// The UTF-32 signatures are tested before the UTF-16 ones because the
// UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE); testing
// UTF-16LE first would make UTF-32LE impossible to detect.
Encoding CodeEncoding::BOMCheck(const unsigned char* bom, size_t size)
{
	if (bom == nullptr)
	{
		return Encoding::Undefined;
	}

	// True when the buffer holds at least `length` bytes and starts with `signature`.
	const auto startsWith = [bom, size](const unsigned char* signature, size_t length) -> bool
	{
		if (size < length)
		{
			return false;
		}
		for (size_t i = 0; i < length; ++i)
		{
			if (bom[i] != signature[i])
			{
				return false;
			}
		}
		return true;
	};

	static const unsigned char utf8Bom[]{ 0xef, 0xbb, 0xbf };
	static const unsigned char utf32LEBom[]{ 0xff, 0xfe, 0x00, 0x00 };
	static const unsigned char utf32BEBom[]{ 0x00, 0x00, 0xfe, 0xff };
	static const unsigned char utf16LEBom[]{ 0xff, 0xfe };
	static const unsigned char utf16BEBom[]{ 0xfe, 0xff };

	if (startsWith(utf8Bom, sizeof(utf8Bom)))
	{
		return Encoding::UTF8WithBOM;
	}

	// Longer signatures first: see the note on UTF-32LE vs UTF-16LE above.
	if (startsWith(utf32LEBom, sizeof(utf32LEBom)))
	{
		return Encoding::UTF32LE;
	}

	if (startsWith(utf32BEBom, sizeof(utf32BEBom)))
	{
		return Encoding::UTF32BE;
	}

	if (startsWith(utf16LEBom, sizeof(utf16LEBom)))
	{
		return Encoding::UTF16LE;
	}

	if (startsWith(utf16BEBom, sizeof(utf16BEBom)))
	{
		return Encoding::UTF16BE;
	}

	return Encoding::Undefined;
}

// Counts the consecutive 1 bits at the most-significant end of `byte`.
// Returns a value in [0, 8]: 0 when the top bit is clear, 8 for 0xFF.
// For a UTF-8 lead byte this count equals the length of the encoded sequence.
int preNUm(unsigned char byte)
{
	int leadingOnes = 0;
	// Walk a single-bit probe from bit 7 downwards until a 0 bit is met
	// or all eight bits have been inspected.
	for (unsigned char probe = 0x80; leadingOnes < 8; probe = static_cast<unsigned char>(probe >> 1))
	{
		if ((byte & probe) != probe)
		{
			break;
		}
		++leadingOnes;
	}
	return leadingOnes;
}

// Structural UTF-8 check: returns true when every byte of `strs` is either a
// single-byte character (0xxxxxxx) or part of a well-formed multi-byte
// sequence, i.e. a lead byte followed by the right number of 10xxxxxx
// continuation bytes. An empty string is accepted.
//
// Accepted lead-byte forms (the count of leading 1 bits is the sequence length):
//   110x_xxxx                      -> 2 bytes
//   1110_xxxx                      -> 3 bytes
//   1111_0xxx                      -> 4 bytes
//   1111_10xx / 1111_110x          -> 5 / 6 bytes (legacy forms)
//
// Rejected: a stray continuation byte (10xx_xxxx) in lead position, the bytes
// 0xFE / 0xFF, and a sequence cut short by the end of the string.
// NOTE: this does not reject overlong encodings or surrogate code points.
//
// Algorithm adapted from https://blog.csdn.net/weixin_38595946/article/details/103128955
bool CodeEncoding::IsUtf8(const std::string& strs)
{
	const size_t len = strs.length();
	size_t i = 0;
	while (i < len)
	{
		const unsigned char lead = static_cast<unsigned char>(strs[i]);
		++i;

		if ((lead & 0x80) == 0x00)
		{
			// 0xxx_xxxx: plain single-byte character.
			continue;
		}

		// preNUm() returns the number of 1 bits preceding the first 0 bit of
		// the lead byte, which is also the total byte length of the sequence.
		const int num = preNUm(lead);
		if (num < 2 || num > 6)
		{
			// 1 -> continuation byte without a lead; 7/8 -> 0xFE/0xFF, never valid.
			return false;
		}

		const size_t trailing = static_cast<size_t>(num - 1);
		if (len - i < trailing)
		{
			// The sequence runs past the end of the string.
			return false;
		}

		// The remaining num - 1 bytes must all look like 10xx_xxxx.
		for (size_t j = 0; j < trailing; ++j, ++i)
		{
			if ((static_cast<unsigned char>(strs[i]) & 0xc0) != 0x80)
			{
				return false;
			}
		}
	}
	return true;
}

// Structural GBK check: returns true when `strs` consists only of
//   - single bytes below 0x80 (ASCII), and
//   - two-byte characters with a lead byte in 0x81..0xFE followed by a trail
//     byte in 0x40..0xFE other than 0x7F.
// A lone 0x80 or 0xFF, an out-of-range trail byte, or a lead byte at the very
// end of the string makes the result false. An empty string is accepted.
//
// pAllAscii - optional out-parameter (may be null). When non-null it is always
//             written: true only if the function returns true and no byte had
//             its high bit set, false otherwise.
//
// Bytes are examined as unsigned char so that the range comparisons behave the
// same whether plain `char` is signed or unsigned on the target platform.
//
// Algorithm adapted from https://www.jb51.net/article/128576.htm
bool CodeEncoding::IsGBK(const std::string& strs, bool* pAllAscii/* = nullptr*/)
{
	bool allAscii = true;     // no byte >= 0x80 seen so far
	bool expectTrail = false; // previous byte was a lead byte awaiting its trail byte
	bool valid = true;

	for (const char raw : strs)
	{
		const unsigned char byte = static_cast<unsigned char>(raw);

		if (expectTrail)
		{
			// Second byte of a two-byte character.
			if (byte < 0x40 || byte == 0x7F || byte > 0xFE)
			{
				valid = false;
				break;
			}
			expectTrail = false;
		}
		else if (byte >= 0x80)
		{
			// Not ASCII: must be the lead byte of a two-byte character.
			allAscii = false;
			if (byte < 0x81 || byte > 0xFE)
			{
				valid = false;
				break;
			}
			expectTrail = true;
		}
	}

	if (expectTrail)
	{
		// The string ended in the middle of a two-byte character.
		valid = false;
	}

	if (pAllAscii != nullptr)
	{
		*pAllAscii = valid && allAscii;
	}

	return valid;
}
